# Package bootstrap: every package listed in `need` is attached; packages that
# are not yet installed are installed first (automatic).
need <- c(
  "dplyr",  # data structuring/cleaning
  "haven",  # load data
  "here",   # setting wd
  "scales", # convenience function to recode to 0-1
  "tidyr"   # data structuring/cleaning
)
# Install-if-missing pattern (similar to https://stackoverflow.com/questions/4090169/elegant-way-to-check-for-missing-packages-and-install-them)
have <- is.element(need, rownames(installed.packages()))
if (!all(have)) {
  install.packages(need[!have])
}
# The result of lapply() is assigned only so it is not printed at top level
pack <- lapply(need, library, character.only = TRUE)
# Remove the helper objects again so they do not linger in the workspace
rm(have, need, pack)

# Data-cleaning helper: coerce a vector to numeric via its character
# representation. Going through as.character() first means a factor is
# converted from its labels rather than from its internal integer codes.
#   x: vector to convert (e.g. factor, character, numeric)
# Returns a numeric vector of the same length as `x`.
convert <- function(x) {
  x_chr <- as.character(x)
  as.numeric(x_chr)
}

# Working directory / project root
# Please make sure your wd is set correctly to load the data.
# Note: the value of here::here() is not stored, and the data file below is
# read from an absolute path, so adjust that path to your own machine.
here::here()

# Read the GSS Stata file into `gss` (note: takes a while)
gss <- haven::read_dta(file = "D:/dv_rep/Datasets/gss7221_r2.dta")

# Select relevant variables and transform data
# Steps, in order: (1) keep the analysis variables, (2) strip the haven labels,
# (3) drop rows with missing controls, (4) recode the attitude items,
# (5) build the control dummies, (6) drop 'other party' respondents.
# mutate() evaluates its arguments sequentially, so the order of the recodes
# below matters (e.g. items are reverse coded before 1 is subtracted).
gss_Dat <- gss %>%
  dplyr::select(
    year,      #year of survey
    cohort,    #year of birth
    fefam,     #gender: woman take care of the family
    fechld,    #gender: relationship when mother works
    fepresch,  #gender: preschool child suffers if mother works
    abdefect,  #abortion: defect at birth
    abnomore,  #abortion: no more children
    abhlth,    #abortion: health of mother
    abpoor,    #abortion: financial
    abrape,    #abortion: rape
    absingle,  #abortion: single
    premarsx,  #pre-marital sex
    premars1,  #pre-marital sex: alternative format
    homosex,   #same-sex sexual relations
    homosex1,  #same-sex sexual relations: alternative format
    divlaw,    #divorce laws
    helppoor,  #government aid: poor people
    helpnot,   #government aid: do more/private businesses
    helpsick,  #government aid: health care
    helpblk,   #government aid: aid to Blacks
    natrace,   #national spending: aid to Blacks
    natsoc,    #national spending: social security
    eqwlth,    #equal wealth (re)distribution
    partyid,   #party identification
    polviews,  #political self-ID
    educ,      #highest year of education completed, recode to college later
    attend,    #church attendance, recode later
    race,      #interviewer coded, recode to white/other later
    relig,     #religious preference, recode to protestant later
    age        #age respondent
  ) %>%
  # Zap labels to be sure: in some versions of R the haven labels of race and
  # gender return error messages otherwise. across(everything(), ...) is the
  # current replacement for the superseded mutate_all().
  mutate(across(everything(), haven::zap_labels)) %>%
  # Filter missings on age and controls
  filter(
    !is.na(attend) & !is.na(educ) & !is.na(race) & !is.na(relig) &
      !is.na(partyid) & !is.na(age)
  ) %>%
  mutate(
    # Recode to numeric
    across(
      c(starts_with('ab'), starts_with('premars'), homosex1, fepresch, fefam),
      as.numeric
    ),
    # Reverse code as (observed column maximum - x + 1). Note! not fechld!
    # NOTE(review): the reversal uses the maximum found in the data, so it is
    # only correct if the highest response code actually occurs - confirm.
    across(
      c(starts_with('ab'), homosex, fefam, fepresch),
      ~as.numeric(max(., na.rm = TRUE) - . + 1)
    ),
    # Abortion items: 2 to 0 (applied after the reversal above)
    across(c(starts_with('ab')), ~recode(., `2` = 0)),
    # Swap codes 2 and 3. Note! recode stay as is (3) as 2
    divlaw = recode(convert(divlaw), `3` = 2, `2` = 3),
    # Reverse both pre-marital sex versions (premarsx and premars1)
    across(
      c(starts_with('premars')),
      ~recode(., `4` = 1, `3` = 2, `2` = 3, `1` = 4)
    ),
    # Shift these items down by one
    across(
      c(starts_with("nat"), starts_with("help"), homosex, polviews, eqwlth),
      ~ . - 1
    ),
    # Missings: after reversing and shifting, 0 is the highest original code
    # of homosex, which is set to NA here
    homosex = na_if(homosex, 0),
    # Coalesce versions: the first non-missing of the two formats is used
    premars = coalesce(premarsx, premars1),
    # NOTE(review): homosex1 is neither reverse coded nor shifted before it is
    # coalesced with homosex - confirm this matches its original coding.
    homosexC = coalesce(homosex, homosex1),
    # Shift party identification up by one (see the filter at the end)
    partyid = as.numeric(partyid) + 1,
    # Control dummies (1/0); anything not matching the condition is coded 0
    college = case_when(educ > 12 ~ 1, TRUE ~ 0),
    white = case_when(race == 1 ~ 1, TRUE ~ 0),
    cattend = case_when(attend > 3 ~ 1, TRUE ~ 0), #at least once a month
    #combine with denomination (protestant)
    protcattend = case_when(attend > 3 & relig == 1 ~ 1, TRUE ~ 0),
    prot = case_when(relig == 1 ~ 1, TRUE ~ 0)
  ) %>%
  # Note: we added one earlier, so 8 is the original code 7. We filter other
  # party responses in the GSS. The ANES does not have such a PID category.
  filter(partyid != 8)

# Filter missing responses
# Keep only rows that are complete on every variable listed below. if_all() is
# required for this: if_any() would keep a row as soon as a single one of the
# columns is non-missing, so rows with e.g. a missing cohort would pass.
gss_Dat <- gss_Dat %>%
  filter(
    if_all(
      c(age, cohort, white, protcattend, college, partyid),
      ~ !is.na(.)
    )
  )

# Cohort and age classifications, each with its APC reference category set
# directly on creation:
#   genDec   - birth cohort by decade (Appendix), reference "1950-59"
#   genPew   - main categorization based on the Pew Research Center,
#              reference "Boomer"
#   year     - survey year as factor, reference "1996"
#   ageGroup - age bands, reference "30-64"
# cut() uses right-closed intervals, e.g. (1909, 1919] is labelled '1910-19';
# values outside the breaks become NA.
gss_Dat <- gss_Dat %>%
  mutate(
    genDec = relevel(
      cut(
        cohort,
        breaks = seq(1909, 1999, by = 10),
        labels = c('1910-19', '1920-29', '1930-39', '1940-49', '1950-59',
                   '1960-69', '1970-79', '1980-89', '1990-99')
      ),
      ref = "1950-59"
    ),
    genPew = relevel(
      cut(
        cohort,
        breaks = c(1909, 1927, 1945, 1964, 1980, 1996),
        labels = c('Greatest', 'Silent', 'Boomer', 'Gen X', 'Millennial')
      ),
      ref = "Boomer"
    ),
    # age must be numeric before it is binned into ageGroup below
    age = as.numeric(age),
    year = relevel(as.factor(year), ref = "1996"),
    ageGroup = relevel(
      cut(
        age,
        breaks = c(15, 21, 29, 64, 100),
        labels = c('17-21', '22-29', '30-64', '65+')
      ),
      ref = "30-64"
    )
  )

# Filter 'Unnamed' and 'Gen Z' respondents as we don't have enough respondents
# for these cohorts.
# These respondents lie outside the genPew breaks, so they carry NA instead of
# a level of their own; comparing genPew with those labels alone only removed
# them because filter() drops rows whose condition is NA. The NA exclusion is
# therefore explicit here, and the label check is kept in case such levels
# are ever added to genPew.
# Set reference categories (a no-op if they are already the first level) and
# drop factor levels that are no longer used.
gss_Dat <- gss_Dat %>%
  filter(
    !is.na(genPew),
    !genPew %in% c("Unnamed", "Gen Z (1997-)")
  ) %>%
  mutate(
    year = relevel(year, ref = "1996"),
    ageGroup = relevel(ageGroup, ref = "30-64")
  ) %>%
  droplevels()

# Write the cleaned data frame `gss_Dat` to disk as an .Rdata file
# (absolute path: adjust to your own machine)
save(list = "gss_Dat", file = "D:/dv_rep/Datasets/gss_Dat.Rdata")
